House Hold Energy Data - Time Series

About Dataset

Introduction

This data was collected for an apartment unit in San Jose for one plus year. The data is collected with smart meters and shared by the energy company. This is time-series data by nature and can be used for various time-series Machine Learning experiments.

Description of Data

The data contains eight attributes.

TYPE - This is an information column. The value is ‘Electric usage’ for all the observations.
DATE - Date of electric consumption. There is no timestamp in this field.
START TIME - Start time of the consumption.
END TIME - End time of the consumption.
USAGE - Consumption in kWh
UNITS - This column denotes measurement unit. It is kWh for all the observations.
COST - Cost of consumption in $.
NOTES - Mostly an empty column.

PRE-PROCESSING DATA

Reading the data

# ---- Setup ----
# stringr               : string manipulation (str_sub)
# ggplot2 / ggthemes    : plotting and themes
# dplyr                 : data wrangling
# urca, tseries         : unit-root / stationarity tests
# fpp2, forecast        : time-series models and forecasting helpers
library(stringr)
library(ggplot2)
library(ggthemes)
library(dplyr)
library(urca)
library(tseries)
library(fpp2)
library(forecast)
options(warn=-1)  # NOTE(review): globally silences ALL warnings — can hide real problems; consider removing
# NOTE(review): absolute, machine-specific path — prefer a relative path or file.path()
data<-read.csv("/Users/shubhammishra/Desktop/VII Semester/ECO619- Time Series Analysis and Forecasting/Assignment/Data/archive/D202.csv")

Sub-setting rows

# Parse COST from "$x.xx" strings to numeric, derive the year from DATE,
# and keep only the 2017 observations.
# fixed = TRUE makes gsub() treat "$" literally instead of as a regex anchor.
# (Fix: spelled out TRUE instead of the reassignable shorthand T.)
data$COST <- as.numeric(gsub("$", "", as.character(data$COST), fixed = TRUE))
# DATE looks like "1/1/2017"; stripping "/" gives "112017", whose last four
# characters are the year.
data$year <- gsub("/", "", as.character(data$DATE), fixed = TRUE)
data$year <- str_sub(data$year, -4)
data <- subset(data, year == '2017')

Sub-setting columns

# Build a single character timestamp from DATE + START.TIME, drop the
# metadata columns, and keep the three working columns in the order
# time / use / cost.
data$time <- as.character(paste(data$DATE, data$START.TIME))
drop_cols <- c("year", "TYPE", "DATE", "START.TIME", "END.TIME", "UNITS", "NOTES")
data <- data[!names(data) %in% drop_cols]
colnames(data) <- c("use", "cost", "time")
data <- select(data, "time", everything())

Data reduction row-wise — aggregating every four 15-minute readings into hourly observations

# Aggregate every block of 4 consecutive rows into one row by summing
# `use` and `cost`, then re-attach one representative timestamp per block.
#
# nth_delete drops every n-th row of a data frame.
# (Renamed from the garbled original `Nth.ddatate`; used only in this chunk.)
nth_delete <- function(dataframe, n) {
  dataframe[-(seq(n, to = nrow(dataframe), by = n)), ]
}
# Repeatedly dropping every n-th row for n = 4, 3, 2 keeps exactly one row
# out of each original group of four — these surviving timestamps label the
# aggregated rows below.
i <- 4
dummy <- data
while (i >= 2) {
  dummy <- nth_delete(dummy, i)
  i <- i - 1
}
i <- 4
# Group rows into consecutive blocks of size i via gl() and sum each block.
# (Fix: summarise_each(funs(sum)) is deprecated — replaced with the modern
# summarise(across(everything(), sum)), which sums all non-grouping columns.)
data <- data %>%
  group_by(time = gl(ceiling(nrow(data) / i), i, nrow(data))) %>%
  summarise(across(everything(), sum))
data$time <- dummy$time  # restore character timestamps for the kept rows

Keeping only the important object (`data`) in the environment

# Remove every object except `data` to tidy the workspace.
# NOTE(review): this also deletes the helper functions defined above;
# acceptable here because they are no longer needed.
rm(list = setdiff(ls(), c("data")))

Summary

# Sanity check: 8760 rows = 365 days x 24 hours for 2017.
summary(data)
##      time                use              cost        
##  Length:8760        Min.   :0.0000   Min.   :0.00000  
##  Class :character   1st Qu.:0.1200   1st Qu.:0.04000  
##  Mode  :character   Median :0.1600   Median :0.04000  
##                     Mean   :0.4835   Mean   :0.09604  
##                     3rd Qu.:0.4000   3rd Qu.:0.08000  
##                     Max.   :9.4400   Max.   :2.52000
# Structure: character timestamps plus numeric hourly usage (kWh) and cost ($).
str(data)
## tibble [8,760 × 3] (S3: tbl_df/tbl/data.frame)
##  $ time: chr [1:8760] "1/1/2017 0:00" "1/1/2017 1:00" "1/1/2017 2:00" "1/1/2017 3:00" ...
##  $ use : num [1:8760] 1.48 1.4 1.4 1.4 1.32 1.36 1.36 1.32 1.28 0.36 ...
##  $ cost: num [1:8760] 0.28 0.24 0.24 0.24 0.24 0.24 0.24 0.24 0.24 0.08 ...

WORKING DATA FOR ANALYSIS

# Working copy of the cleaned data for the analysis below.
df<-data

Time-series plot for the data

# Line plot of hourly usage across the full year.
# group = 1 is required because `time` is a character axis.
ggplot(df, aes(x = time, y = use, group = 1)) +
  geom_line() +
  xlab('Time index') +
  ylab('Usage') +
  theme_clean()

Apply the time-series function with a monthly cycle: 8760 hours / 12 = 730 observations per period

# Hourly usage as a ts object with a "monthly" seasonal period
# (8760 / 12 = 730 hourly observations per cycle).
count_ma<-ts(df$use,frequency = 8760/12)

Periodic decomposition

# STL decomposition; s.window = "periodic" forces the seasonal component to
# be identical across cycles.
decomp<-stl(count_ma,"periodic")

Seasonally adjust the decomposition

# Remove the STL-estimated seasonal component from the series.
deseasonal_cnt<-seasadj(decomp)  #Returns seasonally adjusted data constructed by removing the seasonal component

Plotting

# Trend / seasonal / remainder panels of the STL decomposition.
plot(decomp) #seasonality visible

Testing for stationarity

plot(df$use, type='l')   #visual check for stationary variance

# Augmented Dickey-Fuller: null hypothesis is a unit root (non-stationary);
# a small p-value rejects the null in favour of stationarity.
adf.test(count_ma, alternative = 'stationary')   #adf test
## 
##  Augmented Dickey-Fuller Test
## 
## data:  count_ma
## Dickey-Fuller = -14.801, Lag order = 20, p-value = 0.01
## alternative hypothesis: stationary

The ADF test rejects the unit-root null at the 1% significance level, indicating the series is stationary

Auto correlations

# ACF: correlation between the series and each of its first 30 lags.
Acf(count_ma, main='', lag.max = 30)  #corr btw the time-series and its lag

# PACF: correlation with each lag after removing the effect of shorter lags.
Pacf(count_ma, main='', lag.max = 30)  #corr btw the time-series and its lag using partial autocorrelation function

Taking the first difference of the seasonally adjusted series, and rechecking

Re-testing for stationarity

ss <- 1  #difference counter (order of differencing)
# First (regular) difference of the seasonally adjusted series.
count_d1 <- diff(deseasonal_cnt, differences = ss)
plot(count_d1)  #visual check

adf.test(count_d1, alternative = "stationary")  #adf test
## 
##  Augmented Dickey-Fuller Test
## 
## data:  count_d1
## Dickey-Fuller = -35.857, Lag order = 20, p-value = 0.01
## alternative hypothesis: stationary

The ADF test again rejects the unit-root null at the 1% significance level — the differenced series is stationary

Re-testing auto correlations

# ACF of the differenced series — used to suggest MA order.
Acf(count_d1, main='', lag.max = 30)  #corr btw the time-series and its lag

# PACF of the differenced series — used to suggest AR order.
Pacf(count_d1, main='', lag.max = 30)  #corr btw the time-series and its lag using partial autocorrelation function

MODEL FITTING

Fitting ARIMA model

# Automatic ARIMA order selection on the seasonally adjusted series,
# restricted to non-seasonal models. Selected model: ARIMA(0,1,1).
fit1 <- auto.arima(deseasonal_cnt, seasonal = FALSE)  #auto mode gives the most like version for arima
fit1   #gives ARIMA(0,1,1)
## Series: deseasonal_cnt 
## ARIMA(0,1,1) 
## 
## Coefficients:
##           ma1
##       -0.9773
## s.e.   0.0033
## 
## sigma^2 = 0.7356:  log likelihood = -11084.93
## AIC=22173.86   AICc=22173.86   BIC=22188.02
# Residual diagnostics: a good fit leaves no structure in the residual ACF/PACF.
tsdisplay(residuals(fit1), lag.max = 30, main = 'Model Residuals from ARIMA(0,1,1)')  #checking the residuals

Trying other fittings

# Hand-picked ARIMA(2,0,24) based on the ACF/PACF plots above.
# Compared to fit1 by AIC (lower is better).
fit2 <- arima(deseasonal_cnt, order=c(2,0,24)) # from previous plot - experimental values
fit2  #is the AIC lower?
## 
## Call:
## arima(x = deseasonal_cnt, order = c(2, 0, 24))
## 
## Coefficients:
##          ar1     ar2      ma1      ma2      ma3      ma4     ma5      ma6
##       0.5604  0.4110  -0.1496  -0.5702  -0.1967  -0.0382  0.0122  -0.0151
## s.e.  0.1292  0.1277   0.1288   0.0757   0.0452   0.0148  0.0128   0.0129
##           ma7      ma8     ma9     ma10    ma11    ma12     ma13     ma14
##       -0.0565  -0.0052  0.0326  -0.0289  0.0671  0.1173  -0.0023  -0.0622
## s.e.   0.0130   0.0141  0.0129   0.0133  0.0134  0.0168   0.0163   0.0141
##          ma15     ma16    ma17     ma18     ma19     ma20    ma21    ma22
##       -0.0224  -0.0060  0.0201  -0.0317  -0.0248  -0.0167  0.0083  0.0324
## s.e.   0.0142   0.0129  0.0129   0.0124   0.0150   0.0130  0.0130  0.0131
##         ma23    ma24  intercept
##       0.0206  0.0420     0.4930
## s.e.  0.0132  0.0118     0.0367
## 
## sigma^2 estimated as 0.6106:  log likelihood = -10269.42,  aic = 20594.85
# AIC drops from 22173.86 (fit1) to 20594.85, so fit2 fits better in-sample.
tsdisplay(residuals(fit2), lag.max = 30, main = 'Model Residuals from ARIMA(2,0,24)')

FORECASTING

Predicting new future values after 8760 obs

# Forecast beyond the last observation using forecast()'s default horizon
# (no explicit h) — NOTE(review): confirm the intended horizon is the default.
autoplot(forecast(fit1)) 

Predict from data point 8000 to 8760 over the test data

# Hold-out evaluation: refit ARIMA(2,0,24) on observations 1..7999 and
# forecast over the 761 held-out points (indices 8000:8760).
# Bug fixes from the original:
#  * window(ts(x), start(8000)) called start() ON the number 8000 instead of
#    passing the named argument start = 8000, so `hold` was the full series.
#  * h = 759 did not match the 761 points removed (8000:8760 inclusive).
hold <- window(ts(deseasonal_cnt), start = 8000)
mod_fit <- arima(ts(deseasonal_cnt[-c(8000:8760)]), order = c(2, 0, 24))
ff <- forecast(mod_fit, h = 761)
plot(ff, main="") #plotting the forecast
lines(ts(deseasonal_cnt)) #actual value over the forecast values

Bringing back seasonality and again forecasting

Again fitting arima model with seasonality

# Automatic ARIMA with the seasonal search enabled.
# NOTE(review): applied to the already seasonally adjusted series — confirm
# whether the raw seasonal series was intended here.
sfit1 <- auto.arima(deseasonal_cnt, seasonal = TRUE)
tsdisplay(residuals(sfit1), lag.max = 30, main = 'Residuals from auto ARIMA with seasonality')

Forecasting with seasonality

# 760-step-ahead forecast from the seasonal auto-ARIMA, with the actual
# series overlaid for visual comparison.
sff<-forecast(sfit1, h=760)
plot(sff, main="Auto Arima with seasonality")
lines(ts(deseasonal_cnt))

Accuracy Testing with selected models

Compiling different fit models

Fitting auto ARIMA with seasonality

# Refit of the seasonal auto-ARIMA (same call as earlier) so all candidate
# models are collected in one place for the accuracy comparison.
sfit1 <- auto.arima(deseasonal_cnt, seasonal = TRUE)
tsdisplay(residuals(sfit1), lag.max = 30, main = 'Model Residuals from auto ARIMA with seasonality')

Fitting auto ARIMA without seasonality

# Refit of the non-seasonal auto-ARIMA (selects ARIMA(0,1,1), as before).
fit1 <- auto.arima(deseasonal_cnt, seasonal = FALSE)
tsdisplay(residuals(fit1), lag.max = 30, main = 'Model Residuals from ARIMA(0,1,1)')

Fitting arima(2,0,24)

# Refit of the hand-picked ARIMA(2,0,24).
fit2 <- arima(deseasonal_cnt, order=c(2,0,24))
tsdisplay(residuals(fit2), lag.max = 30, main = 'Model Residuals from ARIMA(2,0,24)')

Fitting arima(1,1,1)

# Fit ARIMA(1,1,1) and inspect ITS residuals.
# Bug fix: the original passed residuals(fit2) while labelling the plot
# as ARIMA(1,1,1) — it must plot residuals(fit3).
fit3 <- arima(deseasonal_cnt, order=c(1,1,1))
tsdisplay(residuals(fit3), lag.max = 30, main = 'Model Residuals from ARIMA(1,1,1)')

Fitting ETS (error, trend, seasonality) model

# ETS (error/trend/seasonal) state-space model fitted to the raw usage data.
# NOTE(review): df$use is a plain numeric vector (frequency 1), so ets()
# cannot estimate a seasonal component here — confirm this is intended.
fit10 <- ets(df$use)  #ETS model of the original data
plot(fit10)

tsdisplay(residuals(fit10), lag.max = 30, main = 'ETS model')

Visual accuracy, forecasting different fit models together

# Arrange base-graphics plots in a 2 x 3 grid so the five forecast plots
# below appear side by side.
par(mfrow=c(2,3))

Auto ARIMA with seasonality

# Forecast 760 steps ahead with each fitted model; overlay the actual
# series (black line) for a visual accuracy comparison.
sff1<-forecast(sfit1, h=760)
plot(sff1)
lines(ts(deseasonal_cnt))

Auto ARIMA without seasonality

ff1<-forecast(fit1, h=760)
plot(ff1)
lines(ts(deseasonal_cnt))

ARIMA(2,0,24)

ff2<-forecast(fit2, h=760)
plot(ff2)
lines(ts(deseasonal_cnt))

ARIMA(1,1,1)

ff3<-forecast(fit3, h=760)
plot(ff3)
lines(ts(deseasonal_cnt))

ETS model

ff10<-forecast(fit10, h=760)
plot(ff10)
lines(ts(deseasonal_cnt))

Accuracy checking — the lower the MAPE (mean absolute percentage error), the better the accuracy

# In-sample accuracy for each candidate model. NOTE: `use` contains zeros
# (min = 0 in the summary above), so percentage-based measures blow up —
# MAPE is Inf for the ETS model and inflated for the others; RMSE/MAE/MASE
# are the more reliable columns here.
accuracy(sfit1)  #auto arima with seasonality
##                        ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.002743618 0.7902171 0.4319475 15.42881 302.9777 0.8019148
##                   ACF1
## Training set -0.001635
accuracy(fit1)   #auto arima without seasonality
##                        ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.002649596 0.8575934 0.4841065 4.942987 361.1992 0.8987486
##                   ACF1
## Training set 0.3609823
accuracy(fit2)   #arima(2,0,24)
##                        ME      RMSE       MAE     MPE     MAPE      MASE
## Training set -0.001707066 0.7813777 0.4239704 26.4341 310.2939 0.8740648
##                      ACF1
## Training set -0.001212339
accuracy(fit3)   #arima(1,1,1)
##                       ME      RMSE       MAE      MPE     MAPE      MASE
## Training set -0.00349833 0.7973942 0.4284029 15.34037 311.1938 0.8832028
##                    ACF1
## Training set 0.03951601
accuracy(fit10)  #ets model
##                        ME      RMSE      MAE  MPE MAPE     MASE      ACF1
## Training set -0.002375444 0.8952574 0.474522 -Inf  Inf 1.193445 0.3560017